In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("HR_Analytics.csv")
df.tail()


Out[2]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years sales salary
14994 0.40 0.57 2 151 3 0 1 0 support low
14995 0.37 0.48 2 160 3 0 1 0 support low
14996 0.37 0.53 2 143 3 0 1 0 support low
14997 0.11 0.96 6 280 4 0 1 0 support low
14998 0.37 0.52 2 158 3 0 1 0 support low

In [3]:
# Rename the 'sales' column to 'Department': it actually holds department names, not a constant value
df = df.rename(columns={'sales': 'Department'})
df.tail()


Out[3]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years Department salary
14994 0.40 0.57 2 151 3 0 1 0 support low
14995 0.37 0.48 2 160 3 0 1 0 support low
14996 0.37 0.53 2 143 3 0 1 0 support low
14997 0.11 0.96 6 280 4 0 1 0 support low
14998 0.37 0.52 2 158 3 0 1 0 support low

In [4]:
# Encode the categorical columns 'Department' and 'salary' as integers
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Department'] = le.fit_transform(df['Department'])

# 'salary' is ordinal, so map it explicitly rather than letting LabelEncoder pick the order
df['salary'] = df['salary'].replace(['low', 'medium', 'high'], [0, 1, 2])
df.tail()


Out[4]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years Department salary
14994 0.40 0.57 2 151 3 0 1 0 8 0
14995 0.37 0.48 2 160 3 0 1 0 8 0
14996 0.37 0.53 2 143 3 0 1 0 8 0
14997 0.11 0.96 6 280 4 0 1 0 8 0
14998 0.37 0.52 2 158 3 0 1 0 8 0
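
To keep the integer codes interpretable (for example, the 8 shown under Department above), the short sketch below recovers the mapping that LabelEncoder assigned; the dept_mapping name is only illustrative and not part of the original notebook.

# Map each original department name to the integer code LabelEncoder assigned to it
dept_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(dept_mapping)  # e.g. {'IT': 0, ..., 'support': 8, 'technical': 9}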

In [22]:
sns.pairplot(df)
plt.show()



In [23]:
df1 = df.drop('left', axis=1)
corr_data = df1.corr()
sns.heatmap(corr_data, annot=True, linewidths=1)
plt.show()


  • Multicollinearity among the features is low (a VIF sketch follows below).
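
One way to put a number on the multicollinearity claim is the variance inflation factor; the sketch below assumes statsmodels is installed and computes VIF on the raw feature matrix without an intercept, so treat it only as a rough check. The vif_table name is illustrative.

# VIF per feature as a numeric multicollinearity check (values near 1 mean little collinearity)
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_table = pd.DataFrame({
    'feature': df1.columns,
    'VIF': [variance_inflation_factor(df1.values, i) for i in range(df1.shape[1])]
})
print(vif_table)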

In [19]:
from sklearn.ensemble import ExtraTreesClassifier
X = df.drop('left', axis=1)
y = df['left']
forest = ExtraTreesClassifier()
forest.fit(X, y)

importances = forest.feature_importances_

std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()


Feature ranking:
1. feature 0 (0.274515)
2. feature 2 (0.234442)
3. feature 4 (0.171028)
4. feature 3 (0.139046)
5. feature 1 (0.125374)
6. feature 7 (0.019887)
7. feature 8 (0.017474)
8. feature 5 (0.015668)
9. feature 6 (0.002567)
  • A tree ensemble (here an ExtraTreesClassifier, a close relative of random forests) is used to estimate the importance of each independent variable.
  • Since an information gain (IG) can be computed for every split node in the forest, comparing the average IG per feature tells us which independent variables matter most.
  • Influence on leaving, in decreasing order: satisfaction_level => number_project => time_spend_company => average_montly_hours => last_evaluation => Department => Work_accident => salary => promotion_last_5years (the sketch below maps the printed feature indices back to these column names).
  • Interestingly, salary has little influence on the turnover rate.
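
The ranking above prints bare column indices; the sketch below re-prints it with column names, using only objects already defined in the cell above.

# Same importance ranking, but with column names instead of feature indices
for rank, idx in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, X.columns[idx], importances[idx]))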

Relationship between the three most important features and left


In [16]:
sns.barplot(x='left', y='satisfaction_level', data=df)
plt.title('Left over satisfaction_level')
#sns.barplot(df['left'], df['satisfaction_level'])
plt.show()



In [41]:
facet = sns.FacetGrid(df, hue="left",aspect=2)
facet = (facet.map(sns.kdeplot,'satisfaction_level', shade = True).add_legend())
plt.show()


  • The higher the satisfaction level, the lower the share of employees who leave.
  • Leaving is concentrated around satisfaction levels of roughly 0.1, 0.4, and 0.8.

In [11]:
sns.barplot(x='number_project', y='left', data=df)
plt.title('Left over number of projects')
plt.show()



In [13]:
facet = sns.FacetGrid(df, hue="left",aspect=2)
facet = (facet.map(sns.kdeplot,'number_project', shade = True).add_legend())
plt.show()


  • When an employee is assigned 7 projects, the turnover rate is 100% (the groupby sketch below shows the exact rates).
  • A workload of 3 to 4 projects per employee looks like the most reasonable range.
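
The exact leave rate per project count can be read straight from the data with plain pandas:

# Mean of 'left' per project count = fraction of employees who left
print(df.groupby('number_project')['left'].mean())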

In [20]:
sns.barplot(x='time_spend_company', y='left', data=df)
plt.title('Left over time_spend_company')
#sns.barplot(df['left'], df['satisfaction_level'])
plt.show()



In [19]:
facet = sns.FacetGrid(df, hue="left",aspect=2)
facet = (facet.map(sns.kdeplot,'time_spend_company', shade = True).add_legend())
plt.show()


  • The longer an employee has stayed at the company (7 or more years), the lower the turnover rate.
  • Some interesting observations among the remaining features:

In [33]:
facet = sns.FacetGrid(df, hue="left",aspect=2)
facet = (facet.map(sns.kdeplot,'last_evaluation', shade = True).add_legend())
plt.show()



In [34]:
sns.countplot(x='promotion_last_5years', hue='left', data=df)
plt.title('Left over promotion_last_5years')
plt.show()


  • None of the employees who received a promotion in the last 5 years left the company (the crosstab sketch below shows the exact counts).
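
A crosstab turns the countplot into exact counts; a minimal check:

# Rows: promotion_last_5years (0/1), columns: left (0/1)
print(pd.crosstab(df['promotion_last_5years'], df['left']))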


ML

  • Train, Test split

In [37]:
from sklearn.model_selection import train_test_split
y = df['left']
X = df.drop('left', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
  • Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
logistic_score_train = logistic.score(X_train, y_train)
print "Train Score : ", logistic_score_train
logistic_score_test = logistic.score(X_test, y_test)
print "-" * 40
print "Test Score : ", logistic_score_test


Train Score :  0.795601552393
----------------------------------------
Test Score :  0.786464646465
  • SVM

In [39]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
svm_score_train = svm.score(X_train, y_train)
print "Train Score : ", svm_score_train
svm_score_test = svm.score(X_test, y_test)
print "-" * 40
print "Test Score : ", svm_score_test


Train Score :  0.96347895313
----------------------------------------
Test Score :  0.947474747475
  • Decision tree

In [40]:
from sklearn import tree
decision = tree.DecisionTreeClassifier()
decision.fit(X_train, y_train)
decision_score_train = decision.score(X_train, y_train)
print "Train score : ",decision_score_train
decision_score_test = decision.score(X_test, y_test)
print "-" * 40
print "Test score : ",decision_score_test


Train score :  1.0
----------------------------------------
Test score :  0.971313131313
  • Random forest

In [49]:
from sklearn.ensemble import RandomForestClassifier
random = RandomForestClassifier()
random.fit(X_train, y_train)
random_score_train = random.score(X_train, y_train)
print "Training score : ",random_score_train
random_score_test = random.score(X_test, y_test)
print "-" * 40
print "Testing score : ",random_score_test


Training score :  0.997810727436
----------------------------------------
Testing score :  0.985454545455

Model comparison


In [45]:
models = pd.DataFrame({
        'Model'          : ['Logistic Regression', 'SVM', 'Decision Tree', 'Random Forest'],
        'Train_Score' : [logistic_score_train, svm_score_train, decision_score_train, random_score_train],
        'Test_Score'  : [logistic_score_test, svm_score_test, decision_score_test, random_score_test]
    })
models.sort_values(by='Test_Score', ascending=True)


Out[45]:
Model Test_Score Train_Score
0 Logistic Regression 0.786465 0.795602
1 SVM 0.947475 0.963479
2 Decision Tree 0.971313 1.000000
3 Random Forest 0.985051 0.998209
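
Scores from a single train/test split can flatter the tree-based models; as a sanity check, the sketch below repeats the comparison with 5-fold cross-validation, reusing the estimators defined above (cross_val_score refits clones internally).

# 5-fold cross-validated accuracy for each model
from sklearn.model_selection import cross_val_score

for name, model in [('Logistic Regression', logistic), ('SVM', svm),
                    ('Decision Tree', decision), ('Random Forest', random)]:
    scores = cross_val_score(model, X, y, cv=5)
    print(name, ":", scores.mean())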
